A presentation example for teaching non-statistics students with no previous university-level statistics or math courses. Designed specifically for the interview process for the Department of Statistical Science at the University of Toronto.
Let’s get some Ice Cream Sales data from the Federal Reserve! A.K.A. IPN31152N:
The lubridate library is representative of standard date manipulation functionality.
# https://fred.stlouisfed.org/series/IPN31152N
# Read the FRED production index and attach a month-of-year label for grouping.
# https://lubridate.tidyverse.org
# https://stackoverflow.com/questions/33221425/how-do-i-group-my-date-variable-into-month-year-in-r
i_scream <- read_csv("IPN31152N.csv") %>%
  mutate(month = lubridate::month(DATE, label = TRUE))
Here are the monthly totals of Shark Attacks, and the average Industrial Ice Cream Production (IPN31152N index).
The tidyverse does standard group_by and inner_join table operations.

| Month | Total Shark Attacks | Average Ice Cream Sales |
|---|---|---|
| Apr | 156 | 119.34282 |
| Aug | 319 | 125.16183 |
| Dec | 62 | 79.48116 |
| Feb | 57 | 102.87803 |
| Jan | 51 | 86.83899 |
| Jul | 343 | 130.87839 |
| Jun | 234 | 138.61432 |
| Mar | 96 | 114.67007 |
| May | 149 | 122.50903 |
| Nov | 116 | 85.92279 |
| Oct | 198 | 97.61893 |
| Sep | 284 | 110.95583 |
# , fig.height=7, fig.width=7, out.width="75%"}
# https://stackoverflow.com/questions/<below>
# 14942681/change-size-of-axes-title-and-labels-in-ggplot2
# 28243514/ggplot2-change-title-size
# 23527385/place-y-axis-on-the-right
# Labelled scatter of monthly shark attacks vs. average ice cream production,
# with a least-squares trend line.  (Fixed title typo: "Sharks attacks".)
ggplot(sharks_i_scream,
       mapping = aes(x = `Average Ice Cream Sales`,
                     y = `Total Shark Attacks`,
                     label = Month)) +
  geom_smooth(method = 'lm') + geom_text() +
  ggtitle('Shark attacks are more prevalent when we eat more Ice Cream!') +
  # "Jaws" theme near the top-left, munching near the bottom-right
  annotate("text", x = 90, y = 370, size = 6,
           label = 'Duh-Duh, Duh-Duh...', color = 'gray') +
  annotate("text", x = 128, y = -20, size = 6,
           label = 'Nom! Nom! Nom! Nom!', color = 'gray') +
  theme(axis.title = element_text(size = 14),
        plot.title = element_text(size = 15)) +
  scale_y_continuous(position = "right")
How would ice cream production mechanistically influence shark attacks?
library(gridExtra)
# Base scatter + linear fit; the four panels below re-label it to show how the
# same picture supports very different causal stories.
a_plot <- sharks_i_scream %>%
  ggplot(mapping = aes(x = `Average Ice Cream Sales`,
                       y = `Total Shark Attacks`)) +
  geom_point() + geom_smooth(method = 'lm') +
  coord_cartesian(ylim = c(50, 340)) +
  theme(axis.title = element_text(size = 12),
        plot.title = element_text(size = 15))
#no_ticks <- theme(axis.text.x = element_blank(),
#                  axis.text.y = element_blank())
# http://www.sthda.com/english/articles/24-ggpubr-publication-ready-plots/81-ggplot2-easy-way-to-mix-multiple-graphs-on-the-same-page/
# Fixed typos ("Counfounding" -> "Confounding", "Coffeine" -> "Caffeine") and
# the argument name: grid.arrange/arrangeGrob take `respect=`, not `aspect=`
# (consistent with the later grid.arrange call).
grid.arrange(grobs = list(a_plot + geom_text(aes(label = Month),
                                             vjust = "inward", hjust = "inward") +
                            ggtitle("(A) Confounding"),
                          a_plot + labs(x = 'Caffeine Consumption',
                                        y = 'Productivity') +
                            ggtitle("(B) Causality"),
                          a_plot + labs(x = 'Height', y = 'Weight') +
                            ggtitle("(C) Complexity"),
                          a_plot + labs(x = 'Dosage', y = 'Response') +
                            ggtitle("(D) Intervention")),
             nrow = 2, respect = TRUE)
# install.packages('latex2exp')
# install.packages('latex2exp')
library(latex2exp)
# Plain (unlabelled) scatter reused by both panels below.
a_plot <- ggplot(sharks_i_scream,
                 mapping = aes(x = `Average Ice Cream Sales`,
                               y = `Total Shark Attacks`)) + geom_point()
# http://www.sthda.com/english/articles/24-ggpubr-publication-ready-plots/81-ggplot2-easy-way-to-mix-multiple-graphs-on-the-same-page/
obs_panel <- a_plot +
  labs(x = latex2exp::TeX('Uncontrolled Observed Treatment $(X)$'),
       y = latex2exp::TeX('Observed Outcome $(Y)$')) +
  ggtitle("(E) Observational Study")
exp_panel <- a_plot +
  labs(x = latex2exp::TeX('Controlled Treatment Assignment $(T)$'),
       y = latex2exp::TeX('Observed Outcome $(Y)$')) +
  ggtitle("(F) Controlled Experiment")
grid.arrange(obs_panel, exp_panel, ncol = 2, respect = TRUE)
Is there something that can guess the Intervention?
\[\huge \textrm{ Is } f(X|Y) \textrm{ actually } f(X|Y,Z)?\]
\[\huge f(X|Y,Z) = f(X|Y) = f(T|Y)\]
Bonus: this concept is called propensity scores!
# Build a synthetic confounder that (by construction) drives both the
# treatment and the outcome.
XYZ <- sharks_i_scream %>%
  rename(Treatment = `Average Ice Cream Sales`,
         Outcome = `Total Shark Attacks`) %>%
  mutate(Confounder = Treatment + Outcome)
# https://plotly.com/r/3d-subplots/
# Bare z-axis styling shared by every 3d scene below.
ax <- list(
  title = 'Confounder',
  zeroline = FALSE,
  showline = FALSE,
  showticklabels = FALSE,
  showgrid = FALSE
)
# https://community.plotly.com/t/droplines-from-points-in-3d-scatterplot/4113/11
# Stack the data on top of a copy flattened to Confounder = 0 so that
# add_paths draws vertical droplines from each point to the floor.
XYZ_floor <- bind_rows(XYZ, mutate(XYZ, Confounder = 0))
scene1 <- list(camera = list(eye = list(x = -.000001, y = -.01, z = 3)),
               zaxis = ax)
confounder <- plot_ly(scene = 'scene1', showlegend = FALSE) %>%
  add_markers(data = XYZ, marker = list(color = 'black'),
              x = ~Treatment, y = ~Outcome, z = ~Confounder) %>%
  add_paths(data = XYZ_floor, line = list(color = 'black'),
            x = ~Treatment, y = ~Outcome, z = ~Confounder)
# Flat plane at Confounder = 100, used only to build droplines spanning 0..100.
XYZ %>% mutate(Confounder=100+0*Confounder) -> XYZ2
# Two stacked copies; flattening the second to 0 makes add_paths draw
# vertical droplines (same trick as above).
XYZ2_floor <- replicate(2, XYZ2, simplify = F)
XYZ2_floor[[2]]$Confounder <- 0
XYZ2_floor <- XYZ2_floor %>% bind_rows()
# NOTE(review): XYZ2 is overwritten here before plotting — the green markers
# sit on the mid-plane at 50 while the droplines built above run 100 -> 0.
# This looks intentional for the visual, but confirm.
XYZ %>% mutate(Confounder=50+0*Confounder) -> XYZ2
axx <- list(title = "Treatment", automargin = TRUE)
axy <- list(title = "Outcome", automargin = TRUE)
scene2 = list(camera=list(eye=list(x=-.000001, y=-.01, z=3)),
zaxis=ax, xaxis=axx, yaxis=axy)
# "No confounder" panel: green markers on a flat plane plus green droplines.
plot_ly(scene='scene2', showlegend=FALSE) %>%
add_markers(data=XYZ2, marker=list(color='green'),
x=~Treatment, y=~Outcome, z=~Confounder) %>%
add_paths(data=XYZ2_floor, line=list(color='green'),
x=~Treatment, y=~Outcome, z=~Confounder) -> no_confounder
# Side-by-side 3d scenes: confounded (left) vs unconfounded (right).
subplot(confounder, no_confounder) %>%
layout(scene=scene1, scene2=scene2, margin = list(pad = 10))
# "Uniform" panel: a stack of n evenly spaced confounder planes over [0, 100],
# deliberately skipping the midpoint at 50, which is drawn separately below.
XYZ2 <- XYZ %>% mutate(Confounder = 100 + 0*Confounder) %>%
  add_column(size = .1, color = 'green')
n <- 20
XYZ2_floor <- replicate(n, XYZ2, simplify = FALSE)
# (i-1)*100/n for the lower half, i*100/n for the upper half; the gap at 50
# is filled by the highlighted mid-plane (XYZ2 below).
# Fixed: loop bounds were hard-coded as 1:20 and i<=10 despite n <- 20;
# now derived from n via seq_len(n) and n/2.
for (i in seq_len(n)) {
  if (i <= n/2) {
    XYZ2_floor[[i]]$Confounder <- (i - 1)*100/n
  } else {
    XYZ2_floor[[i]]$Confounder <- i*100/n
  }
}
XYZ2_floor <- bind_rows(XYZ2_floor)
# Highlighted mid-plane at Confounder = 50 (larger markers).
XYZ2 <- XYZ %>% mutate(Confounder = 50 + 0*Confounder) %>%
  add_column(size = 10, color = 'blue')
axx <- list(title = "Treatment", automargin = TRUE)
axy <- list(title = "Outcome", automargin = TRUE)
scene1 <- list(camera = list(eye = list(x = -.000001, y = -.01, z = 3)),
               zaxis = ax, xaxis = axx, yaxis = axy)
uni <- plot_ly(scene = 'scene1', showlegend = FALSE) %>%
  add_markers(data = XYZ2_floor, size = ~size, sizes = c(10, 25),
              marker = list(symbol = 'circle', sizemode = 'diameter',
                            color = 'blue'),
              x = ~Treatment, y = ~Outcome, z = ~Confounder) %>%
  add_markers(data = XYZ2, size = ~size, sizes = c(10, 25),
              marker = list(symbol = 'circle', sizemode = 'diameter',
                            color = 'green'),
              x = ~Treatment, y = ~Outcome, z = ~Confounder)
# "Randomized" panel: confounder levels drawn at random — randomization breaks
# any systematic link between the confounder and treatment/outcome.
XYZ2 <- XYZ %>% mutate(Confounder = 100 + 0*Confounder) %>%
  add_column(size = .1, color = 'green')
n <- 20
XYZ2_floor <- replicate(n, XYZ2, simplify = FALSE)
# Fixed: the original if (i<=10) / else branches were byte-identical
# (both rnorm(12, 50, 20)); collapsed to a single statement, with the draw
# count generalized from the hard-coded 12 to the actual row count.
# NOTE: rnorm() is unseeded, so this figure differs from run to run.
for (i in seq_len(n)) {
  XYZ2_floor[[i]]$Confounder <- rnorm(nrow(XYZ2_floor[[i]]), 50, 20)
}
XYZ2_floor <- bind_rows(XYZ2_floor)
# Highlighted mid-plane at Confounder = 50 (larger markers).
XYZ2 <- XYZ %>% mutate(Confounder = 50 + 0*Confounder) %>%
  add_column(size = 10, color = 'blue')
axx <- list(title = "Treatment", automargin = TRUE)
axy <- list(title = "Outcome", automargin = TRUE)
scene2 <- list(camera = list(eye = list(x = -.000001, y = -.01, z = 3)),
               zaxis = ax, xaxis = axx, yaxis = axy)
rand <- plot_ly(scene = 'scene2', showlegend = FALSE) %>%
  add_markers(data = XYZ2_floor, size = ~size, sizes = c(10, 25),
              marker = list(symbol = 'circle', sizemode = 'diameter',
                            color = 'blue'),
              x = ~Treatment, y = ~Outcome, z = ~Confounder) %>%
  add_markers(data = XYZ2, size = ~size, sizes = c(10, 25),
              marker = list(symbol = 'circle', sizemode = 'diameter',
                            color = 'green'),
              x = ~Treatment, y = ~Outcome, z = ~Confounder)
subplot(uni, rand) %>%
  layout(scene = scene1, scene2 = scene2, margin = list(pad = 0))
One More Important Benefit: by randomly sampling from the population, we also make our treatment groups representative of that population.
A presentation example for teaching Upper-Year Statistics Major students. Designed specifically for the interview process for the Department Statistical Science at the University of Toronto.
\[ \begin{align*} \hline \log \prod_{i=1}^n \frac{1}{\sqrt{2\sigma}}e^{-\frac{1}{2}\left(\frac{y_i-\mathbf{x}_i^\intercal \boldsymbol{\beta}}{ \Large \sigma}\right)^2} {} = & \sum_{i=1}^n \log \frac{1}{\sqrt{2 \sigma}} - \sum_{i=1}^n \frac{1}{2}\left(\frac{y_i-\mathbf{x}_i^\intercal \boldsymbol{\beta}}{\large \sigma}\right)^2 \\\\\hline {}\\ \log \left(\left( \prod_{i=1}^n \frac{1}{\sqrt{2\sigma}}e^{-\frac{1}{2}\left(\frac{y_i-\mathbf{x}_i^\intercal \boldsymbol{\beta}}{\large \sigma}\right)^2} \right) (2\sigma_0)^{-p/2} e^{-\frac{(\boldsymbol{\beta}-\boldsymbol{\beta}_0)^\intercal(\boldsymbol{\beta}-\boldsymbol{\beta}_0)}{2\sigma_0^p}} \right) {} = & \sum_{i=1}^n \log \frac{1}{\sqrt{2\sigma}} - \sum_{i=1}^n \frac{1}{2}\left(\frac{y_i-\mathbf{x}_i^\intercal \boldsymbol{\beta}}{\large \sigma}\right)^2 + \\ & {} \log \left((2\sigma_0)^{-p/2}\right) - \frac{(\boldsymbol{\beta}-\boldsymbol{\beta}_0)^\intercal(\boldsymbol{\beta}-\boldsymbol{\beta}_0)}{2\sigma_0^p} \\\\\hline \\ \Large \textrm{Loss} + \textrm{Penalty} \quad\quad\quad \normalsize {} = & \sum_{i=1}^n \frac{1}{2}\left(y_i-\mathbf{x}_i^\intercal \boldsymbol{\beta}\right)^2 + \lambda \sum_{k=1}^{p} \beta_k^2 \\\hline \end{align*} \]
\[ \begin{align*} \hline \log \prod_{i=1}^n \frac{1}{\sqrt{2\sigma}}e^{-\frac{1}{2}\left(\frac{y_i-\mathbf{x}_i^\intercal \boldsymbol{\beta}}{\Large \sigma}\right)^2} {} = & \sum_{i=1}^n \log \frac{1}{\sqrt{2 \sigma}} - \sum_{i=1}^n \frac{1}{2}\left(\frac{y_i-\mathbf{x}_i^\intercal \boldsymbol{\beta}}{\large \sigma}\right)^2 \\\\\hline {}\\ \log \left(\left( \prod_{i=1}^n \frac{1}{\sqrt{2\sigma}}e^{-\frac{1}{2}\left(\frac{y_i-\mathbf{x}_i^\intercal \boldsymbol{\beta}}{\large \sigma}\right)^2} \right) \prod_{k=1}^p \frac{1}{2b_{k0}} e^{-\frac{|\beta_k-\beta_{k0}|}{b_{k0}}} \right) {} = & \sum_{i=1}^n \log \frac{1}{\sqrt{2\sigma}} - \sum_{i=1}^n \frac{1}{2}\left(\frac{y_i-\mathbf{x}_i^\intercal \boldsymbol{\beta}}{\large \sigma}\right)^2 + \\ & {} \sum_{k=1}^{p} \log \frac{1}{2b_{k0}} - \sum_{k=1}^{p} \frac{|\beta_k-\beta_{k0}|}{b_{k0}}\\\\\hline \\ \Large \textrm{Loss} + \textrm{Penalty} \quad\; \normalsize {} = & \sum_{i=1}^n \frac{1}{2}\left(y_i-\mathbf{x}_i^\intercal \boldsymbol{\beta}\right)^2 + \lambda \sum_{k=1}^{p} |\beta_k| \\\hline \end{align*} \]
| (Handling Sparsity via the Horseshoe) |
| (Bayesian regularization: From Tikhonov to horseshoe) |
\[ \huge \begin{align*} \beta_i|\lambda_i,\tau &\sim N(0, \sigma^2=\tau^2\lambda_i^2)\\ \lambda_i &\sim HC_+(1)\\ \\\hline \\ \beta_i | \tau & \sim {} HSP(\tau)\\ \end{align*} \]
\[\Large \begin{align*} \underset{\in \mathbb{R}_+}{\lambda_i} & \sim {} HC_+(\gamma) \\ f(\lambda_i \mid \gamma) & = {} \frac{2\cdot 1_{[\lambda_i \geq 0]}(\lambda_i)}{\pi \gamma \left[1 + \left(\frac{\lambda_i}{\gamma}\right)^2\right]} \end{align*} \]
The posterior distribution of a normal-normal specification is well-known:
\[ \Large \begin{align*} Y_i|\beta_i &\sim N(\beta_i, \sigma^2_Y=1)\\ \beta_i|\lambda_i,\tau=1 &\sim N(0, \sigma^2=\lambda_i^2)\\\\ \implies \beta_i|Y_i, \lambda_i,\tau=1 & \sim {} N \left(Y_i\frac{1}{\frac{1}{\lambda_i^2} + 1}, \sigma^2=\frac{1}{\frac{1}{\lambda_i^2} + 1} \right)\\ \\ \hline\\ \implies E[\beta_i|Y_i, \lambda_i] & = {} Y_i\frac{\lambda_i^2}{1 + \lambda_i^2} \\ & = {} Y_i\frac{\lambda_i^2}{1 + \lambda_i^2} + 0 \underbrace{\left(\frac{1}{1 + \lambda_i^2} \right)}_{\kappa_i}\\ & = {} Y_i (1 - \kappa_i) + 0 \kappa_i \end{align*} \]
\[ \Large \begin{align*} \underset{\in[0,1]}{\kappa_i} = \frac{1}{1+\underset{\in \mathbb{R}_+}{\lambda_i^2}} & \implies {} \frac{1}{\kappa_i}-1 = \lambda_i^2 \\ & \implies {} \lambda_i = \sqrt{\frac{1}{\kappa_i}-1}\\\\\hline \\ \frac{d \lambda_i}{d \kappa_i} & \underset{rule}{\overset{chain}{=}} {} \frac{1}{2}\left(\frac{1}{\kappa_i}-1\right)^{-\frac{1}{2}} \left(-1\kappa_i^{-2}\right) \\\\\hline \end{align*} \]
\[ \large \begin{align*} & \quad\;\; {} \int_{a\in \mathbb{R}_+}^{b\geq a} f_{_{\lambda_i}}(\lambda_i) d \lambda_i\\ & ={} \int_{\frac{1}{1+a^2} \in [0,1]}^{0 \leq \frac{1}{1+b^2} \leq \frac{1}{1+a^2}} f_{_{\lambda_i}}\left(\sqrt{\frac{1}{\kappa_i}-1}\right) \frac{d \lambda_i}{d \kappa_i} d \kappa_i\\ & ={} - \int^{\frac{1}{1+a^2} \in [0,1]}_{0 \leq \frac{1}{1+b^2} \leq \frac{1}{1+a^2}} f_{_{\lambda_i}}\left(\sqrt{\frac{1}{\kappa_i}-1}\right) \frac{1}{2}\left(\frac{1}{\kappa_i}-1\right)^{-\frac{1}{2}} (-1\kappa_i^{-2}) d \kappa_i\\ & ={} - \int^{\frac{1}{1+a^2} \in [0,1]}_{0 \leq \frac{1}{1+b^2} \leq \frac{1}{1+a^2}} \frac{2}{\pi \left[1 + \left(\sqrt{\frac{1}{\kappa_i}-1}\right)^2\right]} \frac{1}{2}\left(\frac{1}{\kappa_i}-1\right)^{-\frac{1}{2}} (-1\kappa_i^{-2}) d \kappa_i\\ & ={} \int^{\frac{1}{1+a^2} \in [0,1]}_{0 \leq \frac{1}{1+b^2} \leq \frac{1}{1+a^2}} \frac{1}{\pi } \left(\frac{1-\kappa_i}{\kappa_i}\right)^{-\frac{1}{2}} \kappa_i^{-1} d \kappa_i \\\\ & ={} \int^{\frac{1}{1+a^2} \in [0,1]}_{0 \leq \frac{1}{1+b^2} \leq \frac{1}{1+a^2}} \frac{\Gamma(1)}{\Gamma(\frac{1}{2}) \Gamma(\frac{1}{2})} (1-\kappa_i)^{\frac{1}{2}-1} {\kappa_i}^{\frac{1}{2}-1} d \kappa_i\\\\\hline \end{align*} \]
# Reticulate! The reticulate package lets me run Python from R.
library(reticulate)
# conda_list()
use_condaenv("PyMC", required=TRUE)
# Python! Using the scipy.stats library.
import numpy as np
from scipy import stats

# Interior grid on (0, 1): drop the endpoints, where the Beta(1/2, 1/2)
# density is unbounded.
support_beta = np.linspace(0, 1, 100)[1:-1]
pdf_beta = stats.beta(a=1/2, b=1/2).pdf(support_beta)
python\[\begin{align*}\\ \Large g_{\kappa_i}(\kappa_i) = f_{\lambda_i} \left(\lambda_i = \sqrt{\frac{1}{\kappa_i}-1}\right) \left|\frac{d \lambda_i}{d\kappa_i}\right| \end{align*}\]
# Change of variables: kappa = 1/(1 + lambda^2)  <=>  lambda = sqrt(1/kappa - 1).
support_cauchy = (1/support_beta - 1)**0.5
pdf_cauchy = stats.halfcauchy.pdf(support_cauchy)
# Jacobian magnitude |d lambda / d kappa| = (1/2)(1/kappa - 1)^(-1/2) kappa^(-2).
abs_dx_wrt_dy = 0.5 * (1/support_beta - 1)**(-0.5) * support_beta**(-2)
# Transformed half-Cauchy density on the kappa scale — should coincide with
# the Beta(1/2, 1/2) pdf computed above.
pdf_transformed_cauchy = pdf_cauchy * abs_dx_wrt_dy
R! R for interactive use.
reticulate Python sessions in RStudio are still very R-centric.
# matplotlib can be output directly, but not plotly.
# Pull the Python arrays back into R through reticulate's `py` object.
x <- py$support_beta
beta_pdf <- py$pdf_beta
transformed_cauchy_pdf <- py$pdf_transformed_cauchy
beta_df <- tibble(x=x, `f(x)`=beta_pdf)
cauchy_df <- tibble(x=x, y=transformed_cauchy_pdf)
# NOTE(review): the title's "-1/2" reads as the density's exponents
# (alpha - 1 = beta - 1 = -1/2); the distribution itself is Beta(1/2, 1/2).
ggplot(beta_df, aes(x=x, y=`f(x)`, color='Beta')) +
  geom_line(size=3) +
  ggtitle(latex2exp::TeX(
    '$\\beta = \\alpha = -\\frac{1}{2}$ Beta Distribution')) +
  geom_line(data=cauchy_df,
            mapping=aes(x=x, y=y, color='Transformed Cauchy'),
            linetype='dashed', size=2) +
  scale_color_manual(values=c("black","yellow"))
# NCI60: MELANOMA (n=8) and RENAL (n=9) cancer cell lines.
library(ISLR)
nci60 <- NCI60
# Reshape one cell type's expression matrix to long format: one row per
# (gene, cell line) pair, tagged with the cell type.  Extracted as a helper
# because the RENAL and MELANOMA pipelines were copy-pasted duplicates.
tidy_cell_lines <- function(nci, cell_type) {
  nci$data[nci$labs == cell_type, ] %>% t() %>%
    as_tibble() %>%
    rowid_to_column() %>% rename(gene = rowid) %>%
    mutate(gene = as.factor(gene)) %>%
    pivot_longer(!gene) %>%
    add_column(cell = cell_type)
}
nci60_RENAL <- tidy_cell_lines(nci60, 'RENAL')
nci60_MELANOMA <- tidy_cell_lines(nci60, 'MELANOMA')
# Combine and add a per-line normal-scores transform of the expression values.
nci60 <- nci60_RENAL %>% bind_rows(nci60_MELANOMA) %>%
  mutate(cell = as.factor(cell)) %>%
  mutate(line = paste(cell, name, sep='_')) %>%
  group_by(line) %>%
  # qqnorm(...)$x gives the normal quantiles of the ranks; /2 halves the spread
  mutate(value_normalized = qqnorm(value, plot.it=FALSE)$x/2) %>%
  ungroup()
# Violin plots of raw vs normalized expression, one row per cell line.
nci60_long <- nci60 %>%
  pivot_longer(cols=c(value, value_normalized), names_to="data") %>%
  arrange(line)
ggplot(nci60_long, aes(y=line, x=value, fill=cell)) +
  geom_violin() +
  scale_fill_manual(values=c("purple","blue"),
                    breaks=c("RENAL","MELANOMA")) +
  facet_grid(cols=vars(data))
# Same panels as above, with boxplots overlaid and the x-axis clipped to [-1, 1].
plot_df <- nci60 %>%
  pivot_longer(cols=c(value, value_normalized), names_to="data") %>%
  arrange(line)
ggplot(plot_df, aes(y=line, x=value, fill=cell)) +
  geom_violin() + geom_boxplot(width=0.3, fill="yellow") +
  scale_fill_manual(values=c("purple","blue"),
                    breaks=c("RENAL","MELANOMA")) +
  facet_grid(cols=vars(data)) + lims(x=c(-1,1))
# 1:1 R data into our Python session (via reticulate's `r` object)!
print(r.nci60.head())
##   gene name value   cell      line  value_normalized
## 0    1   V4  0.28  RENAL  RENAL_V4          0.235919
## 1    1  V11  0.27  RENAL RENAL_V11          0.233051
## 2    1  V12 -0.45  RENAL RENAL_V12         -0.480740
## 3    1  V13 -0.03  RENAL RENAL_V13         -0.055993
## 4    1  V14  0.71  RENAL RENAL_V14          0.591575
print(r.nci60.dtypes)
## gene                category
## name                  object
## value                float64
## cell                category
## line                  object
## value_normalized     float64
## dtype: object
statsmodels! The statsmodels (Python) library!
lm, personally:
# Diagnostics reported by the OLS summary:
#   Cond. No.: (multi)collinearity issues diagnosis
#   Omnibus/Jarque-Bera: normality assumptions (skew/kurtosis)
#   Durbin-Watson: homoscedasticity (between 1 to 2 is usually "okay")
import statsmodels.formula.api as smf

# Keep only the first p genes, then fit a gene-by-cell interaction model.
p = 5
first_p_genes = [str(i) for i in range(1, p + 1)]
kp = r.nci60['gene'].apply(lambda g: g in first_p_genes)
data = r.nci60[kp].copy()
data['gene'] = data['gene'].astype(str).astype('category')
results = smf.ols('value_normalized ~ gene*cell', data=data).fit()
results.summary()
## <class 'statsmodels.iolib.summary.Summary'>
## """
## OLS Regression Results
## ==============================================================================
## Dep. Variable: value_normalized R-squared: 0.311
## Model: OLS Adj. R-squared: 0.229
## Method: Least Squares F-statistic: 3.770
## Date: Tue, 09 Feb 2021 Prob (F-statistic): 0.000590
## Time: 09:17:48 Log-Likelihood: -50.797
## No. Observations: 85 AIC: 121.6
## Df Residuals: 75 BIC: 146.0
## Df Model: 9
## Covariance Type: nonrobust
## ===========================================================================================
## coef std err t P>|t| [0.025 0.975]
## -------------------------------------------------------------------------------------------
## Intercept -0.0438 0.166 -0.265 0.792 -0.374 0.286
## gene[T.2] 0.0648 0.234 0.277 0.783 -0.402 0.531
## gene[T.3] 0.0537 0.234 0.229 0.819 -0.413 0.520
## gene[T.4] -0.6879 0.234 -2.938 0.004 -1.154 -0.222
## gene[T.5] 0.1251 0.234 0.534 0.595 -0.341 0.592
## cell[T.RENAL] -0.0440 0.228 -0.193 0.847 -0.497 0.409
## gene[T.2]:cell[T.RENAL] -0.4075 0.322 -1.266 0.209 -1.049 0.233
## gene[T.3]:cell[T.RENAL] -0.0634 0.322 -0.197 0.844 -0.704 0.578
## gene[T.4]:cell[T.RENAL] 0.0596 0.322 0.185 0.854 -0.581 0.701
## gene[T.5]:cell[T.RENAL] 0.0036 0.322 0.011 0.991 -0.637 0.645
## ==============================================================================
## Omnibus: 10.969 Durbin-Watson: 2.032
## Prob(Omnibus): 0.004 Jarque-Bera (JB): 11.591
## Skew: 0.731 Prob(JB): 0.00304
## Kurtosis: 4.064 Cond. No. 15.7
## ==============================================================================
##
## Notes:
## [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
## """
PyMC! I like PyMC quite a lot.
There is also the TensorFlow.probability project; PyMC seems to be a very active and well-supported community. There is also stan, but that also seems to be a very good choice.
PyMC’s planned integration with TensorFlow
p = 275
first_p_genes = [str(i) for i in range(1, p + 1)]
kp = r.nci60['gene'].apply(lambda g: g in first_p_genes)
data = r.nci60[kp].copy()
data['gene'] = data['gene'].astype(str).astype('category')
# Always keep genes 1..50; screen genes 51..p with a pooled-variance
# two-sample t-test (RENAL vs MELANOMA) at alpha = 0.01.
kp = [str(i) for i in range(1, 51)]
for i in range(51, p + 1):
    gene_rows = data[data.gene == str(i)]
    renal = gene_rows[gene_rows.cell == 'RENAL']['value_normalized']
    melanoma = gene_rows[gene_rows.cell == 'MELANOMA']['value_normalized']
    if stats.ttest_ind(renal, melanoma, equal_var=True).pvalue < 0.01:
        kp.append(str(i))
data = data[data.gene.apply(lambda g: g in kp)].copy()
data['gene'] = data['gene'].astype(str).astype('category')
import pymc3 as pm
# macOS compiler setup references:
# https://stackoverflow.com/questions/51761599/cannot-find-stdio-h
# https://stackoverflow.com/questions/58278260/cant-compile-a-c-program-on-a-mac-after-upgrading-to-catalina-10-15
# https://stackoverflow.com/questions/58278260/cant-compile-a-c-program-on-a-mac-after-upgrading-to-catalina-10-15/58349403#58349403
# https://github.com/rodluger/starry/issues/261
import pandas as pd

# Design matrices: gene main effects plus gene-by-cell-type interactions.
gene_dummies = pd.get_dummies(data['gene'])
cell_dummies = pd.get_dummies(data['cell'])
X_gene_RENAL = gene_dummies.values * cell_dummies[['RENAL']].values
X_gene_MELANOMA = gene_dummies.values * cell_dummies[['MELANOMA']].values
X_gene = gene_dummies.values
# Horseshoe-prior regression: RENAL interaction effects get a global scale tau
# times per-coefficient half-Cauchy local scales (sparsity-inducing).
with pm.Model() as model_horseshoe:
    # Global shrinkage (fixed, not learned) and half-Cauchy hyperprior scale.
    tau = 0.05
    lambda_0 = 1
    # Local shrinkage parameters, one per gene-by-RENAL coefficient.
    lambdas = pm.HalfCauchy('lambdas', beta=lambda_0,
                            shape=(X_gene_RENAL.shape[1]))
    # Per-gene residual scale.
    sd_gene = pm.HalfNormal('sd_gene', sd=1, shape=(X_gene.shape[1]))
    # Gene main effects.
    beta_gene = pm.Normal('beta_gene', mu=0, sd=1,
                          shape=(X_gene.shape[1]))
    # Non-centered parameterization: standard-normal draws, scaled below.
    beta_gene_RENAL_stn = pm.Normal('beta_gene_RENAL_stn', mu=0, sd=1,
                                    shape=(X_gene_RENAL.shape[1]))
    # https://twiecki.io/blog/2017/02/08/bayesian-hierchical-non-centered/
    beta_gene_RENAL = pm.Deterministic("beta_gene_RENAL",
                                       beta_gene_RENAL_stn*tau*lambdas)
    # Likelihood: normal with gene-specific residual sd.
    gene_expression = pm.Normal('gene_expression',
                                mu = X_gene@beta_gene +\
                                pm.math.dot(X_gene_RENAL,beta_gene_RENAL),
                                # https://docs.pymc.io/api/math.html#math
                                sd = X_gene@sd_gene,
                                observed=data['value_normalized'])
# Draw posterior samples (PyMC3 auto-assigns the NUTS sampler).
with model_horseshoe:
    posterior_horseshoe = pm.sample()
## █
## Auto-assigning NUTS sampler...
## Initializing NUTS using jitter+adapt_diag...
## Multiprocess sampling (4 chains in 4 jobs)
## NUTS: [beta_gene_RENAL_stn, beta_gene, sd_gene, lambdas]
## Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 490 seconds.
## There were 238 divergences after tuning. Increase `target_accept` or reparameterize.
## There were 136 divergences after tuning. Increase `target_accept` or reparameterize.
## There were 212 divergences after tuning. Increase `target_accept` or reparameterize.
## There were 150 divergences after tuning. Increase `target_accept` or reparameterize.
## The estimated number of effective samples is smaller than 200 for some parameters.
# NOTE(review): hundreds of divergences after tuning — estimates may be
# unreliable; consider raising `target_accept` as the warnings suggest.
# Render the model's graph structure to "graphname.png".
pm.model_to_graphviz(model_horseshoe).render("graphname", format="png")
## 'graphname.png'
# Comparison model: plain weakly-informative normal priors (sd=10) on all
# coefficients — no shrinkage — with the same likelihood as the horseshoe model.
with pm.Model() as model:
    # Per-gene residual scale.
    sd_gene = pm.HalfNormal('sd_gene', sd=1, shape=(X_gene.shape[1]))
    # Gene main effects, essentially unregularized at sd=10.
    beta_gene = pm.Normal('beta_gene', mu=0, sd=10,
                          shape=(X_gene.shape[1]))
    # Gene-by-RENAL interaction effects, also unshrunk.
    beta_gene_RENAL = pm.Normal('beta_gene_RENAL', mu=0, sd=10,
                                shape=(X_gene_RENAL.shape[1]))
    gene_expression = pm.Normal('gene_expression',
                                mu = X_gene@beta_gene +\
                                X_gene_RENAL@beta_gene_RENAL,
                                sd = X_gene@sd_gene,
                                observed=data['value_normalized'])
# Sample the comparison model's posterior.
with model:
    posterior = pm.sample()
## █
## Auto-assigning NUTS sampler...
## Initializing NUTS using jitter+adapt_diag...
## Multiprocess sampling (4 chains in 4 jobs)
## NUTS: [beta_gene_RENAL, beta_gene, sd_gene]
## Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 139 seconds.
#1-9/(9+ (posterior_horseshoe['sd_gene']/posterior_horseshoe['lambdas'])**2)
# Posterior-mean interaction coefficients and posterior-mean "inverse
# coefficients of variation" (beta / sd), for both models; pulled back into R
# below to visualize the horseshoe's shrinkage.
post_hs_beta_gene_RENAL = posterior_horseshoe['beta_gene_RENAL'].mean(axis=0)
post_hs_CV = (posterior_horseshoe['beta_gene_RENAL']/
posterior_horseshoe['sd_gene']).mean(axis=0)
post_CV = (posterior['beta_gene_RENAL']/
posterior['sd_gene']).mean(axis=0)
post_beta_gene_RENAL = posterior['beta_gene_RENAL'].mean(axis=0)
# Shrinkage factor kappa, estimated as 1 - (horseshoe beta / original beta).
# Fixed local typo: shinkage_ratio -> shrinkage_ratio (both uses).
shrinkage_ratio <- ggplot(data=tibble(`Original Inverse CoV` = py$post_CV,
                                      kappa = 1 - (py$post_hs_beta_gene_RENAL/
                                                   py$post_beta_gene_RENAL))) +
  geom_point(aes(x=`Original Inverse CoV`, y=kappa)) + lims(y=c(0,1))
# Horseshoe vs original inverse CoV, with a loess fit and the identity line.
shrinkage_scatter <- ggplot(data=tibble(`Original Inverse CoV` = py$post_CV,
                                        `Horseshoe Inverse CoV` = py$post_hs_CV)) +
  geom_point(aes(x=`Original Inverse CoV`, y=`Horseshoe Inverse CoV`)) +
  geom_smooth(aes(x=`Original Inverse CoV`, y=`Horseshoe Inverse CoV`),
              formula='y~x', method='loess') +
  geom_abline(intercept=0, slope=1)
# Fixed argument name: grid.arrange/arrangeGrob take `respect=`, not `aspect=`.
grid.arrange(grobs=list(shrinkage_ratio, shrinkage_scatter), nrow=1, respect=TRUE)